
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Pull the most recent Global.health monkeypox line list straight from GitHub.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
DATASET_URL = "https://raw.githubusercontent.com/globaldothealth/monkeypox/main/latest.csv"
data = pd.read_csv(DATASET_URL, low_memory=False)
print(data.shape)
(39001, 36)
data.head()
| ID | Status | Location | City | Country | Country_ISO3 | Age | Gender | Date_onset | Date_confirmation | ... | Source | Source_II | Source_III | Source_IV | Source_V | Source_VI | Source_VII | Date_entry | Date_death | Date_last_modified | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N1 | confirmed | Guy's and St Thomas Hospital London | London | England | GBR | NaN | NaN | 2022-04-29 | 2022-05-06 | ... | https://www.gov.uk/government/news/monkeypox-c... | https://www.who.int/emergencies/disease-outbre... | NaN | NaN | NaN | NaN | NaN | 2022-05-18 | NaN | 2022-05-18 |
| 1 | N2 | confirmed | Guy's and St Thomas Hospital London | London | England | GBR | NaN | NaN | 2022-05-05 | 2022-05-12 | ... | https://www.gov.uk/government/news/monkeypox-c... | NaN | NaN | NaN | NaN | NaN | NaN | 2022-05-18 | NaN | 2022-05-18 |
| 2 | N3 | confirmed | London | London | England | GBR | NaN | NaN | 2022-04-30 | 2022-05-13 | ... | https://www.gov.uk/government/news/monkeypox-c... | NaN | NaN | NaN | NaN | NaN | NaN | 2022-05-18 | NaN | 2022-05-18 |
| 3 | N4 | confirmed | London | London | England | GBR | NaN | male | NaN | 2022-05-15 | ... | https://www.gov.uk/government/news/monkeypox-c... | NaN | NaN | NaN | NaN | NaN | NaN | 2022-05-18 | NaN | 2022-05-18 |
| 4 | N5 | confirmed | London | London | England | GBR | NaN | male | NaN | 2022-05-15 | ... | https://www.gov.uk/government/news/monkeypox-c... | NaN | NaN | NaN | NaN | NaN | NaN | 2022-05-18 | NaN | 2022-05-18 |
5 rows × 36 columns
# Build a long-format table holding, for every column of `data`, one
# 'Missingness' row (NaN count and share) and one 'Completeness' row
# (non-NaN count and share) so both can be drawn as one bar per column.
total_rows = data.shape[0]

nan_df = pd.DataFrame(data.isna().sum()).reset_index()
nan_df.columns = ['Column', 'NaN_Count']
nan_df['NaN_Count'] = nan_df['NaN_Count'].astype('int')
nan_df['NaN_%'] = (nan_df['NaN_Count'] / total_rows * 100).round(1)
nan_df['Type'] = 'Missingness'
nan_df.sort_values('NaN_%', inplace=True)

# Completeness is the complement of missingness. It is computed for all
# columns at once (DataFrame.append no longer exists in pandas >= 2.0) and the
# percentage keeps one decimal so the two rows of a column add up to 100.
complete_df = nan_df.assign(**{
    'NaN_Count': total_rows - nan_df['NaN_Count'],
    'NaN_%': (100 - nan_df['NaN_%']).round(1),
    'Type': 'Completeness',
})

# sort=True orders the columns alphabetically: Column, NaN_%, NaN_Count, Type.
nan_df = pd.concat([nan_df, complete_df], sort=True)
nan_df.head()
| Column | NaN_% | NaN_Count | Type | |
|---|---|---|---|---|
| 0 | ID | 0.0 | 0 | Missingness |
| 33 | Date_entry | 0.0 | 0 | Missingness |
| 26 | Source | 0.0 | 0 | Missingness |
| 5 | Country_ISO3 | 0.0 | 0 | Missingness |
| 35 | Date_last_modified | 0.0 | 0 | Missingness |
# Missingness plot: one bar per dataset column, split by 'Type' into the
# share of missing and the share of present values.
fig = px.bar(
    nan_df,
    x='Column',
    y='NaN_%',
    color='Type',
    color_discrete_sequence=['#dbdbdb', '#38cae0'],
    title='Missingness within this Dataset',
    template='plotly_dark',
    opacity=0.6,
    width=800,
    height=450,
)
fig.update_xaxes(title='Column Name')
fig.update_yaxes(title='Percentage of NaNs')
plt.show(block=False)
fig.show()
# Keep only the rows whose status is "confirmed", then count them per country.
# Country_ISO3 holds three-letter country codes.
is_confirmed = data["Status"] == "confirmed"
confirmed_data = data.loc[is_confirmed]
data_df = (
    confirmed_data.groupby('Country_ISO3')['ID']
    .count()
    .reset_index(name='Total')
)
data_df.head()
| Country_ISO3 | Total | |
|---|---|---|
| 0 | AND | 4 |
| 1 | ARE | 16 |
| 2 | ARG | 49 |
| 3 | AUS | 71 |
| 4 | AUT | 198 |
# World choropleth of confirmed cases per country, drawn on a globe
# (orthographic projection). The subtitle names the projection actually used.
fig = px.choropleth(
    data_df,
    locations="Country_ISO3",
    color="Total",
    hover_name="Country_ISO3",
    color_continuous_scale="peach",
    projection='orthographic',
    template='plotly_dark',
    title='Geographical Distribution of Confirmed Monkeypox Cases<br><sub>Orthographic Projection</sub>',
    height=450,
    width=800,
)
# Show country borders and a dark latitude/longitude grid.
fig.update_geos(
    showcountries=True,
    lataxis={'showgrid': True, 'gridcolor': '#222222'},
    lonaxis={'showgrid': True, 'gridcolor': '#222222'},
)
plt.show(block=False)
fig.show()
# Bubble map: one marker per country, sized and coloured by its confirmed total.
fig = px.scatter_geo(
    data_df,
    locations="Country_ISO3",
    size="Total",
    color="Total",
    hover_name="Country_ISO3",
    projection="natural earth",
    color_continuous_scale=['#06FF00', '#FFE400', '#FF8E00', '#FF1700'],
    template='plotly_dark',
    title='Geographical Distribution of Confirmed Monkeypox Cases using Bubble Maps',
    width=800,
    height=450,
)
plt.show(block=False)
fig.show()
# Same choropleth as the world view, restricted to the 'europe' scope.
fig = px.choropleth(
    data_df,
    scope='europe',
    locations="Country_ISO3",
    color="Total",
    hover_name="Country_ISO3",
    color_continuous_scale="peach",
    template='plotly_dark',
    title='Distribution of Confirmed Monkeypox Cases<br><sub>European </sub>',
    width=800,
    height=450,
)
# Borders, sub-units and a dark latitude/longitude grid.
fig.update_geos(
    showcountries=True,
    showsubunits=True,
    lataxis_showgrid=True,
    lataxis_gridcolor='#222222',
    lonaxis_showgrid=True,
    lonaxis_gridcolor='#222222',
)
plt.show(block=False)
fig.show()
# Number of records per confirmation date, plus the running total over dates.
acumulated_df = (
    data.groupby('Date_confirmation')['ID']
    .count()
    .reset_index(name='Count')
)
acumulated_df['Accumulated Cases'] = acumulated_df['Count'].cumsum()
acumulated_df.head()
| Date_confirmation | Count | Accumulated Cases | |
|---|---|---|---|
| 0 | 2022-01-31 | 2 | 2 |
| 1 | 2022-02-17 | 3 | 5 |
| 2 | 2022-02-28 | 1 | 6 |
| 3 | 2022-03-04 | 2 | 8 |
| 4 | 2022-03-31 | 6 | 14 |
# Bar chart of the per-date counts; bars are labelled and coloured by 'Count'.
fig = px.bar(
    acumulated_df,
    x='Date_confirmation',
    y='Count',
    text='Count',
    color='Count',
    hover_data=['Count'],  # extra column listed in the hover tooltip
    color_continuous_scale=['#bbbbbb', '#38cae0'],
    color_continuous_midpoint=1000,
    opacity=0.9,
    title='Number of Confirmed Cases by Date',
    template='plotly_dark',
    height=450,
    width=800,
)

# Dotted horizontal reference line at the mean count over all reported dates.
avg_returns = acumulated_df['Count'].mean()
avg_returns_color = '#bbbbbb'
fig.add_hline(
    y=avg_returns,
    line_color=avg_returns_color,
    line_dash="dot",
    line_width=1.,
    opacity=0.7,
    fillcolor=avg_returns_color,
    annotation_text=f"Average: {round(avg_returns, 1)} cases per day",
    annotation_position="bottom right",
    annotation_font_size=10,
    annotation_font_color="white",
)

# One tick per month, formatted like "May 2022".
fig.update_xaxes(
    title='Date',
    showticklabels=True,
    dtick="M1",
    tickformat="%b %Y",
)
fig.update_yaxes(title='Confirmed Cases')
plt.show(block=False)
fig.show()
# Count confirmed cases by the weekday of their confirmation date and put the
# result in calendar order (value_counts alone would order by frequency).
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
confirmation_dates = pd.to_datetime(confirmed_data['Date_confirmation'])
weekday_counts = confirmation_dates.dt.day_name().value_counts()
data_sort = weekday_counts.reindex(cats)
data_sort.head()
Monday 7459 Tuesday 7932 Wednesday 5021 Thursday 6701 Friday 7328 Name: Date_confirmation, dtype: int64
# Bar chart of the weekday counts, in the Monday-to-Sunday order of `data_sort`.
# NOTE(review): no `color` argument is given, so the continuous colour-scale
# settings below presumably have no visible effect — confirm.
fig = px.bar(
    data_sort,
    title='Number of Confirmed Cases by Days',
    template='plotly_dark',
    opacity=0.9,
    color_continuous_scale=['#bbbbbb', '#38cae0'],
    color_continuous_midpoint=0,
    width=800,
    height=450,
)
plt.show(block=False)
fig.show()
# Blank out missing values in place: the later `!= ""` filters on 'Symptoms'
# and 'Gender' treat the empty string as "not reported".
data.fillna("",inplace=True) # replace NaN by ""
def function(train):
    """Return all items of *train* as one lower-cased, space-separated string.

    Each item is converted with ``str``, split on whitespace, lower-cased and
    re-joined with single spaces. Every item contributes one trailing space,
    including items that contain no tokens at all.

    Args:
        train: Iterable of values, e.g. a pandas Series of free-text cells.

    Returns:
        The concatenated text; ``""`` when *train* is empty.
    """
    # A single join avoids rebuilding the growing string on every item.
    return "".join(
        " ".join(token.lower() for token in str(item).split()) + " "
        for item in train
    )
def plot_wordcloud(data):
    """Draw a word cloud of the text in *data* on a new matplotlib figure.

    The items of *data* are flattened into one lower-cased string by
    ``function`` and rendered with the ``wordcloud`` package's built-in
    English stop-word list.

    Args:
        data: Iterable of text values (e.g. the 'Symptoms' column).

    Returns:
        None. The image is drawn on the current matplotlib figure.
    """
    # Pass the stop-word set explicitly; it used to be built and then ignored
    # (WordCloud falls back to the same STOPWORDS list when given None).
    stopwords = set(STOPWORDS)
    comment_words = function(data)
    wordcloud = WordCloud(
        width=800,
        height=450,
        contour_color='#023075',
        background_color='black',
        colormap='autumn',
        min_font_size=20,
        stopwords=stopwords,
        collocations=False,  # count single words only, no bigrams
    ).generate(comment_words)

    # Plot the WordCloud image without axes or padding.
    plt.figure(figsize=(16, 9), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
plot_wordcloud(data['Symptoms'])

# Tally every distinct non-empty 'Symptoms' entry. The table is built
# explicitly with the columns 'index' (label) and 'Symptoms' (count) because
# the column names produced by value_counts().reset_index() differ between
# pandas 1.x and 2.x.
symptom_counts = data.loc[data['Symptoms'] != "", 'Symptoms'].value_counts()
temp_df = pd.DataFrame({
    'index': symptom_counts.index,
    'Symptoms': symptom_counts.to_numpy(),
})

# Entries seen only once are folded into a single 'Other' slice whose size is
# the number of such entries; the individual one-off rows are then dropped.
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
other_total = temp_df.loc[temp_df['Symptoms'] < 2, 'Symptoms'].sum()
other_row = pd.DataFrame({'index': 'Other', 'Symptoms': other_total}, index=[0])
temp_df = pd.concat([temp_df, other_row])
temp_df = temp_df.loc[temp_df['Symptoms'] > 1]

# Donut chart of the remaining symptom descriptions.
fig = go.Figure(data=[go.Pie(
    labels=temp_df['index'],
    values=temp_df['Symptoms'],
    hole=.75,
    marker_colors=px.colors.sequential.Agsunset,
)])
fig.update_layout(
    title_text="Majorly affecting symptoms",
    template='plotly_dark',
    width=800,
    height=450,
    # Caption centred in the hole of the donut.
    annotations=[dict(
        text='Symptoms',
        x=0.5,
        y=0.5,
        font_size=20,
        showarrow=False,
    )],
)
plt.show(block=False)
fig.show()
# Normalise the spelling variants of the two main gender labels, in place.
gender_variants = {
    'Male': ['male', 'male '],
    'Female': ['female', 'female '],
}
for label, variants in gender_variants.items():
    data.loc[data['Gender'].isin(variants), 'Gender'] = label

# Confirmed cases with a reported gender ("" marks a missing value here).
filtered_df = data.loc[data['Status'] == 'confirmed']
filtered_df = filtered_df.loc[filtered_df['Gender'] != ""]

# Case count and share per gender; the share is shown as bar text.
temp_df = filtered_df.groupby('Gender')['ID'].count().reset_index(name='Count')
temp_df['Percentage'] = (temp_df['Count'] / temp_df['Count'].sum() * 100).round(2)
temp_df['Percentage'] = temp_df['Percentage'].astype('str') + '%'

n = temp_df['Count'].sum()
n_total = data.shape[0]

fig = px.bar(
    temp_df,
    y='Gender',
    x='Count',
    title=f'Distribution of Sex Among Confirmed Cases<br><sub>Calculated on a sample of {n} out of {n_total} observations in the dataset</sub>',
    color='Gender',
    text='Percentage',
    template='plotly_dark',
    opacity=0.8,
    height=450,
    width=800,
    color_discrete_sequence=['#dbdbdb', '#38cae0'],
)
# The bars are horizontal: counts run along x and the categories along y, so
# the axis titles must follow that orientation (they used to be swapped).
fig.update_xaxes(title='Count of Occurences')
fig.update_yaxes(title='Sex')
plt.show(block=False)
fig.show()